In [136]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from wordcloud import WordCloud
from matplotlib.colors import LinearSegmentedColormap

%matplotlib inline
In [93]:
def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as loading finishes, rather than being left open until garbage collection.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point this at files that come from a trusted source.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Inputs for the large scatter panel: 2-D coordinates, the vectors drawn as
# arrows, and the per-point values used for colouring.
Xnum_trunc = _load_pickle('Xnum_trunc')
w = _load_pickle('Xnum_w')
y = _load_pickle('Xnum_y')

# One (coordinates, labels) pair per group 0-4, used by the small panels.
# File names suggest t-SNE embeddings and k-means labels -- TODO confirm.
X_tsne0 = _load_pickle('X3_bow_ts0')
y_0 = _load_pickle('y_kmeans0')
X_tsne1 = _load_pickle('X3_bow_ts1')
y_1 = _load_pickle('y_kmeans1')
X_tsne2 = _load_pickle('X3_bow_ts2')
y_2 = _load_pickle('y_kmeans2')
X_tsne3 = _load_pickle('X3_bow_ts3')
y_3 = _load_pickle('y_kmeans3')
X_tsne4 = _load_pickle('X3_bow_ts4')
y_4 = _load_pickle('y_kmeans4')
Out[93]:
2044
In [145]:
# Figure 1: one large scatter of the post measures on top (grid rows 0-2) and
# one small scatter per group underneath (grid rows 3-4).
#
# The figure is bound to `fig` here so that the final savefig writes THIS
# figure. Previously `fig` was never assigned in this cell, so the save either
# raised NameError on a fresh kernel or wrote whatever figure another cell had
# left in `fig`.
fig = plt.figure(figsize=(9, 15), dpi=500)
fig.suptitle('Posts Measures and Themes Clustering')
gs = gridspec.GridSpec(5, 3)

# --- Top panel: first two columns of Xnum_trunc, second one sign-flipped. ---
ax1 = fig.add_subplot(gs[:-2, :])
ax1.scatter(Xnum_trunc[:, 0], -Xnum_trunc[:, 1], alpha=0.8, c=y)
ax1.set_axis_off()

# One arrow per row of `w`, all starting at (0.2, -0.4) and scaled by 0.5.
# NOTE(review): the scatter negates the second coordinate but the arrows do
# not -- confirm `w` is already expressed in the flipped orientation.
feature_names = ['Score(log)', 'ViewCount(log)', 'AnswerCount', 'CommentCount', 'FavoriteCount']
for feat, feat_name in zip(w, feature_names):
    dx, dy = .5*feat[0], .5*feat[1]
    ax1.arrow(0.2, -0.4, dx, dy, color='k', width=0.003, ec='none')
    # Labels ending in '(log)' get a different offset from the arrow tip than
    # the other labels.
    if feat_name.endswith('(log)'):
        label_x, label_y = 0.3 + dx, -0.4 + dy
    else:
        label_x, label_y = 0.22 + dx, -0.38 + dy
    ax1.text(label_x, label_y, feat_name, ha='center', color='k')

# --- Bottom panels: (coordinates, labels, colormap name, title) per group. ---
# Panels fill the grid left to right: three on row 3, then two on row 4.
cluster_panels = [
    (X_tsne0, y_0, 'Purples', 'Mod Views, High Eng'),
    (X_tsne1, y_1, 'Blues', 'Mod Views, Low Eng'),
    (X_tsne2, y_2, 'BuGn', 'Low Views, Low Eng'),
    (X_tsne3, y_3, 'Greens', 'Low Views, High Eng'),
    (X_tsne4, y_4, 'YlOrRd', 'High Views, High Eng'),
]
panel_axes = []
for k, (coords, labels, cmap_name, title) in enumerate(cluster_panels):
    panel_ax = fig.add_subplot(gs[3 + k // 3, k % 3])
    panel_ax.scatter(coords[:, 0], coords[:, 1], alpha=0.8, c=labels, cmap=cmap_name)
    # Size of this group as a percentage of len(y), shown in the top-left corner.
    panel_ax.annotate(f'{len(labels)*100/len(y):.2f}%', xy=(0, 0.9), xycoords='axes fraction')
    panel_ax.set_title(title)
    panel_ax.set_axis_off()
    panel_axes.append(panel_ax)

# Keep the individual axes names available to any later cell.
ax2, ax3, ax4, ax5, ax6 = panel_axes

fig.savefig('fig1.png', dpi=500)
In [138]:
# Load the five pickled word-cloud collections (files 'wordcloud0' to
# 'wordcloud4'). Each file is opened in a `with` block so its handle is closed
# right after loading instead of being leaked.
# NOTE(security): pickle.load can execute arbitrary code; only use it on files
# that come from a trusted source.
wordclouds = []
for idx in range(5):
    with open(f'wordcloud{idx}', 'rb') as fh:
        wordclouds.append(pickle.load(fh))

wordcloud0, wordcloud1, wordcloud2, wordcloud3, wordcloud4 = wordclouds
In [141]:
# Word-cloud grid: 10 rows by 6 columns. Column 0 holds the row labels;
# columns 1-5 hold the word clouds, one column per group.
fig, ax = plt.subplots(10, 6, figsize=(15, 25), sharey=False, dpi=500)
plt.subplots_adjust(wspace=0, hspace=0)

# Label column: no frame. Image cells: grey frame, no ticks.
for row in range(10):
    ax[row, 0].set_axis_off()
    for col in range(1, 6):
        cell = ax[row, col]
        for side in ('top', 'left', 'bottom', 'right'):
            cell.spines[side].set_color('Grey')
        cell.set_xticks([])
        cell.set_yticks([])

# Column headers, set on the first row of columns 1-5.
column_titles = [
    '$Hot$ $Posts$\nHi View, Hi Eng',
    '$Trending$\nMod View, Hi Eng',
    '$S.O.S$\nMod View, Low Eng',
    '$Curious$ $Topics$\nLow View, Hi Eng',
    '$Spam$\nLow View, Low Eng',
]
for col, title in enumerate(column_titles, start=1):
    ax[0, col].set_title(title)

# Two-colour gradients used to recolour the clouds of each column.
cmap_g = LinearSegmentedColormap.from_list('mycmap', ['#11644D', '#A0B046'])
cmap_p = LinearSegmentedColormap.from_list('mycmap', ['#DDA0DD', '#4B0082'])
cmap_b = LinearSegmentedColormap.from_list('mycmap', ['#107FC9', '#0B108C'])
cmap_bg = LinearSegmentedColormap.from_list('mycmap', ['#20B2AA', '#008080'])

# Placement table: (grid column, cloud collection, recolour colormap or None,
# target grid row for cloud 0, 1, 2, ...). Cells not listed stay empty.
cloud_columns = [
    (1, wordcloud4, None, [0, 9, 1, 5, 6]),
    (4, wordcloud3, cmap_g, [3, 2, 9, 5, 1, 4]),
    (2, wordcloud0, cmap_p, [0, 5, 1, 6, 9, 2]),
    (3, wordcloud1, cmap_b, [2, 7, 5, 9, 8, 0, 1]),
    (5, wordcloud2, cmap_bg, [6, 0, 9, 5, 3]),
]
for col, clouds, cmap, target_rows in cloud_columns:
    for cloud_idx, row in enumerate(target_rows):
        cloud = clouds[cloud_idx]
        # Column 1 is drawn with the cloud's existing colours; the other
        # columns are recoloured with their column's gradient first.
        image = cloud if cmap is None else cloud.recolor(colormap=cmap)
        ax[row, col].imshow(image, aspect='auto')

# Row labels, written into the frameless first column.
# NOTE(review): '$IEngines$' looks like a typo -- confirm the intended label.
row_labels = [
    '$Pandas$',
    '$Numpy$',
    '$String$\n$Processing$',
    '$Lists$',
    '$Classes$',
    '$Django$',
    '$Matplotlib$',
    '$Web$\n$Interfaces$',
    '$Web$\n$IEngines$',
    '$Mix$',
]
for row, label in enumerate(row_labels):
    ax[row, 0].annotate(label, xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
Out[141]:
Text(0.3, 0.5, '$Mix$')
In [108]:
# How many entries of y_4 carry each distinct value, in sorted-value order.
cluster_ids_4, cluster_sizes_4 = np.unique(y_4, return_counts=True)
list(cluster_sizes_4)
Out[108]:
[24, 263, 21, 25, 9]